# Setup: point knitr at the project root and load the ProjectTemplate
# project (data frames df_WL, df_HJ_bin, df_LDA_bin, df_STM_bin, df, and
# helper functions become available via load.project()).
# NOTE(review): absolute user-specific path — breaks on any other machine;
# consider a relative path or here::here().
knitr::opts_knit$set(root.dir = "/Users/TARDIS/Documents/STUDIES/context_word_seg")
library(ProjectTemplate)
load.project()
######
# WL #
######
# Binarize every word-list context column: any positive weight (including
# the 1.5's introduced by expand_windows) becomes 1, everything else 0.
# The utt/orth/phon identifier columns are excluded from the transformation.
# NOTE(review): mutate_each()/funs() are deprecated in current dplyr
# (mutate(across(...)) is the modern equivalent) — confirm installed version
# before modernizing.
WL <- df_WL %>%
mutate_each(funs(ifelse(. > 0, 1, 0)), -utt, -orth, -phon) # for word list analysis only, make all contexts either 1 or 0 (smoothing over 1.5's from expand_windows)
# Count how many utterances are tagged with each word-list context.
# Compute the column sums once (the original computed the identical
# colSums(select(...)) expression twice) and label with the method name.
WL_counts <- colSums(dplyr::select(WL, -utt, -orth, -phon))
WL_sum <- data.frame(context = names(WL_counts), n = WL_counts) %>%
  mutate(method = "word lists")
# Display the per-context counts, most frequent first.
WL_sum %>%
  select(-method) %>%
  arrange(desc(n)) %>%
  kable()
| context | n |
|---|---|
| body_touch | 1530 |
| meal | 942 |
| fussing | 741 |
| bath | 664 |
| bed | 570 |
| play | 528 |
| diaper_dressing | 471 |
| media | 20 |
######
# HJ #
######
# Coder-judgment context counts. HJ is the working copy of df_HJ_bin.
# Compute the per-context column sums once and reuse them: the original
# summed HJ for the names but df_HJ_bin for the values — the same data
# (HJ <- df_HJ_bin just above), but inconsistent; this uses HJ throughout.
HJ <- df_HJ_bin
HJ_counts <- colSums(dplyr::select(HJ, -utt, -orth, -phon), na.rm = TRUE)
HJ_sum <- data.frame(context = names(HJ_counts), n = HJ_counts) %>%
  mutate(method = "coder judgments")
# Display the per-context counts, most frequent first.
HJ_sum %>%
  select(-method) %>%
  arrange(desc(n)) %>%
  kable()
| context | n |
|---|---|
| playtime | 4789 |
| interaction | 4515 |
| fussing | 2739 |
| mealtime | 1849 |
| bathtime | 1215 |
| none | 914 |
| diaper change | 663 |
| dressing | 412 |
| sleep | 410 |
| touching | 270 |
| housework | 269 |
| taking pictures | 113 |
| hiccups | 85 |
| outside | 45 |
| TV | 43 |
| sick | 18 |
| coughing | 15 |
| swimming | 15 |
| stretching | 13 |
| sneezing | 10 |
| music | 6 |
| drooling | 4 |
| blowing bubbles | 2 |
| friends | 1 |
| pre-doctors visit | 1 |
######
# TM #
######
# LDA topic-assignment counts. Compute the per-topic column sums once
# (the original computed the identical colSums(select(...)) twice).
LDA <- df_LDA_bin
LDA_counts <- colSums(dplyr::select(LDA, -utt, -orth, -phon), na.rm = TRUE)
LDA_sum <- data.frame(context = names(LDA_counts), n = LDA_counts) %>%
  mutate(method = "LDA")
# Display the per-topic counts, most frequent first.
LDA_sum %>%
  select(-method) %>%
  arrange(desc(n)) %>%
  kable()
| context | n |
|---|---|
| topic_11 | 2292 |
| topic_5 | 1887 |
| topic_3 | 1616 |
| topic_4 | 1544 |
| topic_2 | 1478 |
| topic_7 | 1251 |
| topic_6 | 1187 |
| topic_9 | 1176 |
| topic_8 | 1090 |
| topic_12 | 1020 |
| topic_1 | 736 |
| topic_10 | 613 |
# STM topic-assignment counts. Compute the per-topic column sums once
# (the original computed the identical colSums(select(...)) twice).
STM <- df_STM_bin
STM_counts <- colSums(dplyr::select(STM, -utt, -orth, -phon), na.rm = TRUE)
STM_sum <- data.frame(context = names(STM_counts), n = STM_counts) %>%
  mutate(method = "STM")
# Display the per-topic counts, most frequent first.
STM_sum %>%
  select(-method) %>%
  arrange(desc(n)) %>%
  kable()
| context | n |
|---|---|
| topic_3 | 2174 |
| topic_10 | 1938 |
| topic_4 | 1775 |
| topic_7 | 1733 |
| topic_1 | 1575 |
| topic_2 | 1392 |
| topic_8 | 1066 |
| topic_6 | 930 |
| topic_9 | 845 |
| topic_5 | 840 |
| topic_12 | 548 |
| topic_11 | 540 |
# Stack the per-method context counts and plot utterances-per-context,
# one panel per method; the dashed line marks the total corpus size.
all_sum <- rbind(WL_sum, HJ_sum, LDA_sum, STM_sum)
utts_per_context_plot <- ggplot(all_sum, aes(y = n, x = reorder(as.factor(context), n))) +
  geom_bar(stat = 'identity', show.legend = FALSE) +
  geom_hline(aes(yintercept = nrow(df)), lty = 2) +
  facet_wrap(~method, scales = "free_x", ncol = 1) +
  ggtitle("Number of utterances in each context\ndashed line shows total corpus size")
print(utts_per_context_plot)
# Save the same plot explicitly (rather than relying on last_plot()).
ggsave(filename = "utts_per_context.png", plot = utts_per_context_plot,
       path = "graphs/descriptives", width = 5, height = 4, units = "in")
######
# WL #
######
# How many contexts does each utterance receive under the word-list method?
WL$N.contexts <- rowSums(dplyr::select(WL, -utt, -orth, -phon))
table(WL$N.contexts)
##
## 0 1 2 3
## 8334 3924 693 52
# Collapse the binary context columns into one factor; judging from the
# summary below, the project helper extract_contexts() labels 0-context
# rows "no context tag" and multi-context rows "ambiguous" — confirm in
# its definition.
WL <- extract_contexts(WL)
summary(WL$context) # the number of utterances in each context
## ambiguous bath bed body_touch
## 745 440 377 1147
## diaper_dressing fussing meal media
## 295 557 706 12
## no context tag play
## 8334 390
length(which(WL$N.contexts > 2)) / length(WL$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.003999077
length(which(WL$N.contexts == 0)) / length(WL$N.contexts) # percent of corpus with 0 contexts
## [1] 0.640929
# Among coded utterances only: share with > 2 contexts, share with > 1.
length(which(WL$N.contexts > 2)) / length(which(WL$N.contexts > 0))
## [1] 0.01113729
length(which(WL$N.contexts > 1)) / length(which(WL$N.contexts > 0))
## [1] 0.1595631
######
# HJ #
######
# How many contexts does each utterance receive under coder judgments?
# na.rm = TRUE because the HJ matrix contains NAs (see the sums above).
HJ$N.contexts <- rowSums(dplyr::select(HJ, -utt, -orth, -phon), na.rm = TRUE)
table(HJ$N.contexts)
##
## 0 1 2 3
## 550 6889 5165 399
# Collapse to a single context factor (multi-context rows -> "ambiguous",
# 0-context rows -> "no context tag", as in the WL section).
HJ <- extract_contexts(HJ)
summary(HJ$context) # the number of utterances in each context
## ambiguous bathtime diaper change dressing
## 5564 864 289 148
## fussing hiccups housework interaction
## 807 17 95 1325
## mealtime no context tag none outside
## 800 550 169 6
## playtime sick sleep taking pictures
## 2189 2 126 11
## touching TV
## 22 19
length(which(HJ$N.contexts > 2)) / length(HJ$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.03068523
length(which(HJ$N.contexts == 0)) / length(HJ$N.contexts) # percent of corpus with 0 contexts
## [1] 0.04229793
######
# TM #
######
# How many topics does each utterance receive under LDA?
LDA$N.contexts <- rowSums(dplyr::select(LDA, -utt, -orth, -phon), na.rm = TRUE)
table(LDA$N.contexts)
##
## 0 1 2 3
## 31 9100 3350 30
# Collapse to a single topic factor (multi-topic rows -> "ambiguous").
LDA <- extract_contexts(LDA)
summary(LDA$context) # the number of utterances in each context
## ambiguous no context tag topic_1 topic_10 topic_11
## 3380 31 286 300 1440
## topic_12 topic_2 topic_3 topic_4 topic_5
## 570 968 610 1065 1137
## topic_6 topic_7 topic_8 topic_9
## 679 837 758 450
length(which(LDA$N.contexts > 2)) / length(LDA$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00239789
length(which(LDA$N.contexts == 0)) / length(LDA$N.contexts) # percent of corpus with 0 contexts
## [1] 0.00247782
# How many topics does each utterance receive under STM?
STM$N.contexts <- rowSums(dplyr::select(STM, -utt, -orth, -phon), na.rm = TRUE)
table(STM$N.contexts)
##
## 0 1 2 3
## 213 9300 2938 60
# Collapse to a single topic factor (multi-topic rows -> "ambiguous").
STM <- extract_contexts(STM)
summary(STM$context) # the number of utterances in each context
## ambiguous no context tag topic_1 topic_10 topic_11
## 2998 213 621 936 390
## topic_12 topic_2 topic_3 topic_4 topic_5
## 428 750 1432 1178 540
## topic_6 topic_7 topic_8 topic_9
## 540 1256 766 463
length(which(STM$N.contexts > 2)) / length(STM$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00479578
length(which(STM$N.contexts == 0)) / length(STM$N.contexts) # percent of corpus with 0 contexts
## [1] 0.01702502
###########
# OVERALL #
###########
# Per-method utterance-level assignments (utt, N.contexts, context),
# tagged with a method label so the four frames can be stacked.
wl_contexts <- WL %>%
  dplyr::select(utt, N.contexts, context) %>%
  mutate(method = "word list")
hj_contexts <- HJ %>%
  dplyr::select(utt, N.contexts, context) %>%
  mutate(method = "coder judgments")
lda_contexts <- LDA %>%
  dplyr::select(utt, N.contexts, context) %>%
  mutate(method = "LDA")
stm_contexts <- STM %>%
  dplyr::select(utt, N.contexts, context) %>%
  mutate(method = "STM")
# Utterance counts by method and number of contexts, plotted as stacked bars.
all_methods_counts <- rbind(wl_contexts, hj_contexts, lda_contexts, stm_contexts) %>%
  count(method, N.contexts)
ggplot(all_methods_counts, aes(y = n, x = method, fill = as.factor(N.contexts))) +
  geom_bar(stat = "identity") +
  labs(title = "Number of contexts tagged per utterance", y="Number of utterances", x = "Context defined by") +
  scale_fill_discrete(name="Number of contexts")
ggsave(filename ="contexts_per_utt_1.png" , path = "graphs/descriptives", width = 5, height = 4, units="in")
# Re-run the contexts-per-utterance comparison with the coders' "none"
# code removed from HJ, since "none" inflates the apparent context count.
HJ_no_none <- dplyr::select(HJ, -none, -N.contexts, -context)
HJ_no_none$N.contexts <- rowSums(dplyr::select(HJ_no_none, -utt, -orth, -phon), na.rm = TRUE)
HJ_no_none <- extract_contexts(HJ_no_none)
hj_no_none <- dplyr::select(HJ_no_none, utt, N.contexts, context) %>%
mutate(method = "coder judgments")
# Same stacked-bar plot as before, with the "none"-free HJ frame.
all_methods_counts_no_none <- rbind(wl_contexts, hj_no_none, lda_contexts, stm_contexts) %>%
count(method, N.contexts)
ggplot(all_methods_counts_no_none, aes(y=n, x=method, fill=as.factor(N.contexts))) +
geom_bar(stat = "identity") +
labs(title = "Number of contexts tagged per utterance,\nnot including 'none' for coder judgments", y="Number of utterances", x = "Context defined by") +
scale_fill_discrete(name="Number of contexts")
ggsave(filename ="contexts_per_utt_2.png" , path = "graphs/descriptives", width = 5, height = 4, units="in")
# HJ context-count distribution with vs without the "none" code.
table(hj_contexts$N.contexts)
##
## 0 1 2 3
## 550 6889 5165 399
table(hj_no_none$N.contexts)
##
## 0 1 2 3
## 719 7357 4636 291
# Stacked bars of utterances per context for each method. relevel() moves
# the named level to the front of the factor, so after the three calls the
# level order is: ambiguous, no context tag, none, then the rest in their
# original order — matching the three highlight colors in
# scale_fill_manual (every remaining context is grey).
all_methods_context_count <- rbind(wl_contexts, hj_contexts, lda_contexts, stm_contexts) %>%
count(method, context)
all_methods_context_count$context <- relevel(all_methods_context_count$context, "none" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "no context tag" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "ambiguous" )
ggplot(all_methods_context_count, aes(y=n, x=method, fill=context)) +
geom_bar(stat= "identity") +
scale_fill_manual(values = c("#0072B2", "#D55E00", "#E69F00", rep("#999999", length(levels(all_methods_context_count$context)) - 3)))
Printing context files for sharing with CF:
# Export one wide CSV per method (rows = recording, columns = utterance
# number, cells = context label) for sharing with CF.
# Parse the utterance id (e.g. "xx00.cha_123") into child, age, utt.num.
all <- rbind(wl_contexts, hj_no_none, lda_contexts, stm_contexts) %>%
  dplyr::select(utt, context, method) %>%
  tidyr::extract(utt, into = c("child", "age", "utt.num"), regex = "^([[:alpha:]]{2})([[:digit:]]{2})[.]cha_([[:digit:]]+)$")
all$utt.num <- as.numeric(all$utt.num)
# Method label -> file suffix; one spread-and-write per method. The
# original repeated this pipeline four times verbatim; the loop writes
# the same four files in the same order with identical contents.
# NOTE(review): tidyr::spread() is superseded by pivot_wider() in current tidyr.
context_file_suffixes <- c("word list" = "WL",
                           "coder judgments" = "HJ",
                           "LDA" = "LDA",
                           "STM" = "STM")
for (m in names(context_file_suffixes)) {
  all %>%
    dplyr::filter(method == m) %>%
    tidyr::spread(key = utt.num, value = context) %>%
    dplyr::select(-method) %>%
    write.csv(paste0("/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_",
                     context_file_suffixes[[m]], ".csv"),
              row.names = FALSE)
}
Dropping “none” codes from HJ.
# Fix the working data for the remaining analyses: HJ is replaced by the
# "none"-free version; the other three assignments are deliberate no-ops
# kept for symmetry, documenting which version of each frame is in use.
WL <- WL
HJ <- HJ_no_none
LDA <- LDA
STM <- STM
The word list method can (and does) leave large portions of the corpus uncoded, unlike the coder judgment or topic modeling definitions of context. Coders were instructed to respond with “none” if they encountered a section of the corpus where they could not identify any activity context; there are 914 utterances in the corpus that are above threshold for the “none” code, indicating agreement across coders that those utterances do not have an identifiable activity context. This is a much smaller portion of the corpus (7%) than the uncoded section left by the word list method (64.1%), however.
The word list and coder definitions of context naturally produce a skewed distribution of activities, whereas the topic modeling approaches discover a more uniform distribution of activities. To the extent that the distribution of activities is naturally skewed (e.g. that a few activities happen very often, and many others happen rarely), topic modeling approaches to identifying activity context may distort reality.
# Long lookup of word-list keywords: one row per (word, context) pair.
# Built with lapply + do.call(rbind, ...) instead of rbind-ing inside a
# for loop (the original grew the data frame on every iteration, which
# copies it each time — O(n^2)). Same rows in the same order.
contexts <- names(WL_contexts)
WL_context_data <- do.call(rbind, lapply(contexts, function(k) {
  data.frame(word = WL_contexts[[k]], context = k, stringsAsFactors = FALSE)
}))
# Token frequencies over the whole orthographic corpus.
orth_stream <- paste(df$orth, collapse = " ")
# Flag multiword keywords (stored with "_" in the word lists) in the raw
# stream so splitting on spaces keeps them as single tokens.
# fixed = TRUE matches the keyword as a literal string: the original
# treated it as a regex, which would misfire on any keyword containing
# regex metacharacters (".", "?", etc.); for plain words the result is
# identical.
for(w_bar in grep(x = WL_context_data$word, pattern = "_", value=TRUE)){
  w_space <- gsub(x = w_bar, pattern = "_", replacement = " ", fixed = TRUE)
  orth_stream <- gsub(x = orth_stream, pattern = w_space, replacement = w_bar, fixed = TRUE)
}
orth_stream <- strsplit(orth_stream, split = " ", fixed = TRUE)[[1]]
orth_stream <- orth_stream[orth_stream != ""] # drop empty tokens from runs of spaces
orth_data <- data.frame(word = orth_stream, stringsAsFactors = FALSE) %>%
  count(word)
# Attach corpus frequency to each keyword; keywords never observed get NA.
WL_context_data <- left_join(WL_context_data, orth_data, by="word") %>%
  arrange(context, n)
# Per-context keyword usage: total corpus tokens across the context's
# keywords, mean frequency per keyword, and the single most frequent
# keyword. na.rm = TRUE drops keywords never seen in the corpus from the
# sums/means; which.max() also skips NA.
WL_context_data %>%
group_by(context) %>%
summarise(total=sum(n, na.rm=TRUE),
mean.freq=mean(n, na.rm=TRUE),
highest=max(n, na.rm=TRUE),
which.highest=word[which.max(n)]) %>%
kable()
| context | total | mean.freq | highest | which.highest |
|---|---|---|---|---|
| bath | 287 | 15.105263 | 47 | bath |
| bed | 150 | 13.636364 | 43 | tired |
| body_touch | 550 | 20.370370 | 123 | tickle |
| diaper_dressing | 116 | 6.444444 | 50 | nappie |
| fussing | 216 | 19.636364 | 56 | ssh |
| meal | 303 | 8.416667 | 27 | eat |
| media | 4 | 2.000000 | 3 | television |
| play | 136 | 7.157895 | 65 | play |
# One word cloud per coder-judgment context; message() echoes which
# context is being drawn before each cloud is plotted.
for (context_name in HJ_contexts_list) {
  message(context_name)
  cloud_from_df(df_HJ_bin, context_name)
}
## bathtime
## blowing bubbles
## coughing
## diaper change
## dressing
## drooling
## friends
## fussing
## hiccups
## housework
## interaction
## mealtime
## music
## none
## outside
## playtime
## pre-doctors visit
## sick
## sleep
## sneezing
## stretching
## swimming
## taking pictures
## touching
## TV
# Does the word-list method leave utterances uncoded in the same places
# coders said "none"? Build two 0/1 indicator columns and cross them.
df_HJ_none <- df_HJ_bin
# 1 if the utterance is above threshold for the coders' "none" code,
# 0 if not; anything else (NA) stays NA.
df_HJ_none$HJ_none <- ifelse(df_HJ_none$none==1, 1,
ifelse(df_HJ_none$none==0, 0, NA))
df_HJ_none <- dplyr::select(df_HJ_none, utt, HJ_none)
df_WL_0 <- WL
# 1 if the word-list method assigned zero contexts, 0 otherwise.
df_WL_0$WL_0 <- ifelse(df_WL_0$N.contexts == 0, 1,
ifelse(df_WL_0$N.contexts > 0, 0, NA))
df_WL_0 <- dplyr::select(df_WL_0, utt, WL_0)
# match == 1 exactly when both indicators are 1 (their sum is 2);
# NA in either indicator propagates through the sum to NA.
match <- full_join(df_HJ_none, df_WL_0, by="utt") %>%
mutate(match = HJ_none + WL_0)
match$match <- ifelse(match$match == 2, 1,
ifelse(match$match < 2, 0, NA))
# Sanity check: all three frames cover the same 13,003 utterances.
nrow(df_WL_0); nrow(df_HJ_none); nrow(match)
## [1] 13003
## [1] 13003
## [1] 13003
# Cross-tabulate the two indicators (xtabs drops rows with NA).
tab <- xtabs( ~ WL_0 + HJ_none, data = match)
addmargins(tab)
## HJ_none
## WL_0 0 1 Sum
## 0 4405 178 4583
## 1 7423 736 8159
## Sum 11828 914 12742
# Chi-square test of independence, association plots, and effect sizes.
# mosaic() and assocstats() are presumably from the vcd package loaded by
# load.project() — confirm; assocplot() is base R.
summary(tab)
## Call: xtabs(formula = ~WL_0 + HJ_none, data = match)
## Number of cases in table: 12742
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 116.29, df = 1, p-value = 4.101e-27
mosaic(tab)
assocplot(tab)
assocstats(tab)
## X^2 df P(> X^2)
## Likelihood Ratio 127.25 1 0
## Pearson 116.29 1 0
##
## Phi-Coefficient : 0.096
## Contingency Coeff.: 0.095
## Cramer's V : 0.096
What percent of the “none” context utterances in HJ method are 0 context in WL?
# P(WL uncoded | HJ "none"): matched utterances / all HJ "none" utterances.
sum(match$match, na.rm=TRUE) / sum(match$HJ_none, na.rm=TRUE)
## [1] 0.8052516
What percent of the 0 context utterances in WL method are “none” context in HJ?
# P(HJ "none" | WL uncoded): matched utterances / all WL 0-context utterances.
sum(match$match, na.rm=TRUE) / sum(match$WL_0, na.rm=TRUE)
## [1] 0.08831293
What words are associated with each LDA topic?
# Top 10 words per LDA topic, as a data frame with one column per topic.
# (top.topic.words comes from the lda package.)
top_words_lda <- top.topic.words(lda$topics, num.words=10) %>%
  as.data.frame(stringsAsFactors = FALSE)
# seq_len() replaces 1:ncol(...), which would misbehave if ncol were 0.
colnames(top_words_lda) <- paste0("topic_", seq_len(ncol(top_words_lda)))
top_words_lda %>%
  kable()
| topic_1 | topic_2 | topic_3 | topic_4 | topic_5 | topic_6 | topic_7 | topic_8 | topic_9 | topic_10 | topic_11 | topic_12 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| oh | hey | go | yes | hello | yes | oh | ya | oh | oh | mm | oh |
| hey | oh | hey | oh | hmm | oh | come | go | go | yes | mummi | yes |
| dear | mummi | want | well | oh | tell | dear | come | come | ah | yes | tickl |
| hannah | yes | can | boo | dear | go | yes | oh | yes | dear | eh | dear |
| alright | hannah | hmm | hello | got | hey | tell | littl | ya | one | girl | go |
| trea | hello | come | go | gillian | come | good | got | eh | clean | hello | come |
| ssh | look | nice | say | matter | dear | got | yes | bath | two | come | got |
| yum | hold | see | come | smile | look | darl | look | alright | wee | oh | big |
| darl | smile | smile | dear | look | stori | girl | like | get | dirti | good | see |
| like | shh | littl | clever | hey | daddi | mum | want | like | now | alright | good |
LDAvis: http://bl.ocks.org/rosemm/raw/a7b1ac43ffe3b49229ed5e866762613f/
# http://cpsievert.github.io/LDAvis/reviews/reviews.html
# Build the inputs LDAvis needs from the fitted lda model, following
# http://cpsievert.github.io/LDAvis/reviews/reviews.html
alpha <- 0.1 # from lda package demo
eta <- 0.1 # from lda package demo
# theta: document-topic proportions; phi: topic-word probabilities
# (both smoothed, each row normalized to sum to 1).
theta <- t(apply(lda$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(lda$topics) + eta, 2, function(x) x/sum(x)))
D <- length(TM_doc_prep_out$documents) # number of documents
W <- length(TM_doc_prep_out$vocab) # number of terms in the vocab
doc.length <- document.lengths(TM_doc_prep_out$documents) # number of tokens per document
N <- sum(doc.length) # total number of tokens in the data
term.frequency <- word.counts(TM_doc_prep_out$documents, vocab = TM_doc_prep_out$vocab)
lda_data <- list(phi = phi,
                 theta = theta,
                 doc.length = doc.length,
                 vocab = TM_doc_prep_out$vocab,
                 term.frequency = as.integer(term.frequency))
# Pass lda_data straight to createJSON instead of re-typing the same five
# arguments (the original duplicated the whole argument list verbatim).
json <- do.call(createJSON, lda_data)
serVis(json, as.gist = TRUE)
Wordles, showing the frequencies of words in the utterances assigned to each topic.
# One wordle per LDA topic: word frequencies within the utterances
# assigned to that topic. The original spelled out twelve identical
# cloud_from_df() calls; this loop makes the same calls in the same order.
for (topic_name in paste0("topic_", seq_len(12))) {
  cloud_from_df(df_LDA_bin, topic_name)
}
## On the recorded run, topic_11 emitted a wordcloud warning that some
## words could not be fit on page and were not plotted.
What words are associated with each STM topic?
# Print the fitted STM: top words per topic, covariate words per child
# group (cr/gi/gl/la/st), and topic-by-group interaction words (output below).
summary(stm)
## A topic model with 12 topics, 427 documents and a 531 word dictionary.
## Topic Words:
## Topic 1: side, lost, dear, round, ha, said, teddi
## Topic 2: bop, hello, monkey, give, hi, thing, say
## Topic 3: shh, hey, shake, rabbit, els, chou, light
## Topic 4: door, lunch, minut, sit, lucki, sweetheart, watch
## Topic 5: tickl, toe, feet, din, tick, tum, boy
## Topic 6: away, one, nappi, will, wee, sorri, milk
## Topic 7: hair, water, bad, fun, minut, deari, drink
## Topic 8: bum, bath, splash, swim, shake, dad, mum
## Topic 9: boo, bye, ah, clever, hide, see, tea
## Topic 10: hand, see, naughti, fed, can, bite, look
## Topic 11: juic, whether, duck, quack, miss, leav, madam
## Topic 12: dear, got, oh, hello, shame
##
## Covariate Words:
## Group cr: grumpi, gut, gorgeous, hmm, full, fast, shall
## Group gi: jumper, throat, without, keep, christoph, bash, reach
## Group gl: precious, handi, snooz, temper, mat, slept, happi
## Group la: hannah, busi, usual, nosh, prove, mummi, whose
## Group st: mobil, oven, cradl, appl, pussycat, cupsi, shoulder
##
## Topic-Covariate Interactions:
## Topic 1, Group cr: pretti, forget, nail, whee, let, dress, tear
## Topic 1, Group gi: clap, hand, hiccough, might, hah, christoph, left
## Topic 1, Group gl: struggl, walk, lost, aw, danc, punch, chin
## Topic 1, Group la: dub, bib, dear, bath, someth, hurt, whee
## Topic 1, Group st: hmm, stuck, okay, push, myron, kick, camera
##
## Topic 2, Group cr: ticki, shake, excit, bore, gillian, chou, camera
## Topic 2, Group gi: oop, us, stuff, bash, christoph, hello, friend
## Topic 2, Group gl: hmm
## Topic 2, Group la: pretti, skirt, seen, sunshin, kick, hello, bop
## Topic 2, Group st: foot, attent, grab, must, nose, matter, play
##
## Topic 3, Group cr: shush, matter, pleas, dolli, gillian, ah, hey
## Topic 3, Group gi: littl, re, wind, kick, shake, lad, chou
## Topic 3, Group gl: bop, rest, talk, oop, tri, bore, bib
## Topic 3, Group la: polli, near, thank, oop, bubbl, treasur, ho
## Topic 3, Group st: food, swing, smile, myron, suppos, move, ssh
##
## Topic 4, Group cr: girl, love, nose, bubbl, littl, bless
## Topic 4, Group gi: gonna, lad, ticklish, clean, never, mum, hous
## Topic 4, Group gl: first, know, ssh, stuff, wet, will, happen
## Topic 4, Group la: iron, arm, downstair, cuddl, god, mm, quiet
## Topic 4, Group st: daddi, goin, readi, burp, wind, ya, wash
##
## Topic 5, Group cr: belli, soft, wee, stori, din, nose, shout
## Topic 5, Group gi: clever, parrot, yum, complain, sack, daisi, terribl
## Topic 5, Group gl: find, tum, old, hah, dinner, thumb, lie
## Topic 5, Group la: excit, three, beebo, two, tum, get, one
## Topic 5, Group st: feet, toe, boy
##
## Topic 6, Group cr: sleepi, huh, head, shout, dolli, eye, chair
## Topic 6, Group gi: burp, suck, vest, carri, roll, bib, finger
## Topic 6, Group gl: hide, astra, three, ah, daisi, hiccup, chang
## Topic 6, Group la: sunshin, clean, wee, straight, nappi, morn, one
## Topic 6, Group st: charl, bib, ever, gonna, forget, tri, whee
##
## Topic 7, Group cr: gonna, tell, cross, patch, shout, madam, stori
## Topic 7, Group gi: fatti, relax, grab, ow, walk, lift, dad
## Topic 7, Group gl: petal, tale, pop, pet, bath, wash, dirti
## Topic 7, Group la: dub, stori, tell, daddi, rub, mum, wash
## Topic 7, Group st: lulu, hot, bottl, wide, juic, garden, first
##
## Topic 8, Group cr: bath, bum
## Topic 8, Group gi: arm, struggl, naughti, stick, nail, wash, let
## Topic 8, Group gl: stori, wee, bubbl, next, ha, aw, mess
## Topic 8, Group la: will, splish, splosh, tonight, whee, air, mm
## Topic 8, Group st: say, anyth, cuddl, myron, shake, milk, hello
##
## Topic 9, Group cr: oop, cross, patch, shush, sack, re, babi
## Topic 9, Group gi: parrot, everyth, best, mouth, cup, teeth, hair
## Topic 9, Group gl: astra, myron, anoth, tomorrow, yeah, clean, one
## Topic 9, Group la: lambchop, sleepi, watch, chou, love, girl, tri
## Topic 9, Group st: hide, stand, ah, littl, bye
##
## Topic 10, Group cr: toe, stretch, game, play, can, tongu, goin
## Topic 10, Group gi: clap, splash, shout, push, stand, soon, ya
## Topic 10, Group gl: pretti, pram, can, bear, afternoon, play, tell
## Topic 10, Group la: lambchop, chou, friend, littl, smile, hmm, daddi
## Topic 10, Group st: polli, lewi, pastri, joseph, rain, though, thank
##
## Topic 11, Group cr: attent, ya, bib, matter, think, minut, alright
## Topic 11, Group gi: duck, quack, bath, will, look, left, minut
## Topic 11, Group gl: get, sunshin, dress, tongu, arm, stori, never
## Topic 11, Group la: paddi, donald, yum, quack, came, point, silli
## Topic 11, Group st: meringu, dub, pie, lemon, rub, joseph, cupboard
##
## Topic 12, Group cr: shame, burp, fatti, hicki, mind, poor, never
## Topic 12, Group gi: nose, stretch, daddi, size, christoph
## Topic 12, Group gl: smile, daddi, dad, bless, tis, stay, girli
## Topic 12, Group la: ssh, trea, pie, alright, armi, darl, funni
## Topic 12, Group st:
##
# Interactive LDAvis view of the STM model, published as a gist.
# http://cpsievert.github.io/LDAvis/reviews/reviews.html
toLDAvis(stm, TM_doc_prep_out$documents, as.gist = TRUE) # This function does not yet allow content covariates.
Wordles, showing the probability of each word given the topic.
# One cloud per STM topic, showing P(word | topic) from the fitted model;
# the loop issues the same twelve cloud() calls in the same order as the
# original explicit list.
for (topic_number in seq_len(12)) {
  cloud(stm, topic = topic_number)
}
Wordles, showing the frequencies of words in the utterances assigned to each topic.
# One wordle per STM topic, showing raw word frequencies in the utterances
# assigned to that topic; equivalent to the original twelve explicit
# cloud_from_df() calls, in the same order.
for (topic_name in paste0("topic_", seq_len(12))) {
  cloud_from_df(df_STM_bin, topic_name)
}